import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Read the stop-and-search field interview records into a DataFrame.
stop_search = pd.read_csv("D:/Stop_and_Search__Field_Interviews_.csv")
stop_search.head()

# Number of rows and columns.
stop_search.shape

# List the columns belonging to each dtype: floats, then ints, then objects.
for dtype_name in ('float64', 'int64', 'object'):
    print(stop_search.select_dtypes(include=[dtype_name]).dtypes)

# Missing values: the grand total first, then a per-column table whose index
# is the column name, sorted so the most incomplete columns come first.
missing_per_column = stop_search.isnull().sum()
print("Total number of missing values are", missing_per_column.sum())
miss_df = missing_per_column.to_frame(name='Num of Missing').rename_axis(None)
miss_df.sort_values(by='Num of Missing', ascending=False)
(1) Demographics; (2) Vehicle information; (3) Locations; (4) Time; (5) Stop type; (6) Stop result.
(1) Who has a higher / lower probability of being stopped?
(1) Make a good and accurate stop and search based on prediction; (2) Explore factors impacting a good stop and search.
(1) Demographics - SubjectRace, SubjectGender, SubjectAge, SubjectHeight, SubjectWeight, SubjectEyeColor, SubjectHairColor, SubjectHasPhotoID.
# Demographic overview: one panel per feature on a 4x2 grid, filled
# left-to-right, top-to-bottom.
# Each entry is (column, plot kind, x-axis label, tick label size, x-limits).
#   "count" -> bar chart of how often each category occurs
#   "dist"  -> sns.distplot of the non-missing values using 50 bins
demographic_panels = [
    ("SubjectRace", "count", "Race", 25, None),
    ("SubjectGender", "count", "Gender", 25, None),
    ("SubjectAge", "dist", "Age", 25, (0, 100)),
    ("SubjectHeight", "dist", "Height", 25, (0, 100)),
    ("SubjectWeight", "dist", "Weight", 25, (0, 500)),
    ("SubjectEyeColor", "count", "Eye Color", 15, None),
    ("SubjectHairColor", "count", "Hair Color", 15, None),
    ("SubjectHasPhotoID", "count", "Has photo ID or not", 25, None),
]

fig = plt.figure(dpi=100, figsize=(30, 60))
for position, panel in enumerate(demographic_panels, start=1):
    column, kind, x_label, tick_size, x_limits = panel
    ax = fig.add_subplot(4, 2, position)
    # Draw on the explicit axes via ax=; the data is passed by keyword because
    # recent seaborn releases treat the first positional argument as `data`.
    if kind == "count":
        sns.countplot(x=stop_search[column], ax=ax)
        ax.set_ylabel("Count", fontsize=40)
    else:
        sns.distplot(stop_search[column].dropna(), bins=50, ax=ax)
        ax.set_ylabel("Probability", fontsize=40)
    if x_limits is not None:
        # seaborn no longer re-exports pyplot as sns.plt, so set limits on the axes.
        ax.set_xlim(*x_limits)
    ax.tick_params(axis="both", labelsize=tick_size)
    ax.set_xlabel(x_label, fontsize=40)
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the inline notebook backend.
plt.show()
(2) Vehicle information - VehicleModel, VehicleStyle, VehicleYear, VehicleColor, VehicleMake, SubjectDriverLicState.
# Frequency of every distinct value for each vehicle-related feature.
for column in ("VehicleYear", "VehicleModel", "VehicleStyle",
               "VehicleColor", "VehicleMake", "SubjectDriverLicState"):
    print(stop_search[column].value_counts())

# Vehicle overview: one panel per feature on a 4x2 grid (six cells used),
# filled left-to-right, top-to-bottom.
# Each entry is (column, plot kind, x-axis label, tick label size, x-limits).
#   "count" -> bar chart of how often each category occurs
#   "dist"  -> sns.distplot of the non-missing values using 50 bins
vehicle_panels = [
    ("VehicleModel", "count", "Vehicle Model", 10, None),
    ("VehicleStyle", "count", "Vehicle Style", 10, None),
    ("VehicleYear", "dist", "Vehicle Year", 25, (1975, 2020)),
    ("VehicleColor", "count", "Vehicle Color", 25, None),
    ("VehicleMake", "count", "Vehicle Make", 15, None),
    ("SubjectDriverLicState", "count", "Subject Driver License State", 10, None),
]

fig = plt.figure(dpi=100, figsize=(30, 60))
for position, panel in enumerate(vehicle_panels, start=1):
    column, kind, x_label, tick_size, x_limits = panel
    ax = fig.add_subplot(4, 2, position)
    # Draw on the explicit axes via ax=; the data is passed by keyword because
    # recent seaborn releases treat the first positional argument as `data`.
    if kind == "count":
        sns.countplot(x=stop_search[column], ax=ax)
        ax.set_ylabel("Count", fontsize=40)
    else:
        sns.distplot(stop_search[column].dropna(), bins=50, ax=ax)
        ax.set_ylabel("Probability", fontsize=40)
    if x_limits is not None:
        # seaborn no longer re-exports pyplot as sns.plt, so set limits on the axes.
        ax.set_xlim(*x_limits)
    ax.tick_params(axis="both", labelsize=tick_size)
    ax.set_xlabel(x_label, fontsize=40)
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the inline notebook backend.
plt.show()
(3) Locations - Zip, BlockAddress, OfficerAssignment, Zone, District, Longitude, Latitude
# Frequency of every distinct value for each categorical location feature.
for column in ("Zip", "BlockAddress", "OfficerAssignment", "Zone", "District"):
    print(stop_search[column].value_counts())

# Location overview: one panel per feature on a 4x2 grid (six cells used),
# filled left-to-right, top-to-bottom.
# Each entry is (column, plot kind, x-axis label, tick label size, bins).
#   "count" -> bar chart of how often each category occurs
#   "dist"  -> sns.distplot of the non-missing values; bins=None lets seaborn
#              choose the number of bins
location_panels = [
    ("Zip", "dist", "Zip Code", 15, None),
    ("OfficerAssignment", "count", "Officer Assignment", 15, None),
    ("Zone", "count", "Zone", 25, None),
    ("District", "count", "District", 15, None),
    ("Longitude", "dist", "Longitude", 25, 100),
    ("Latitude", "dist", "Latitude", 25, 100),
]

fig = plt.figure(dpi=100, figsize=(30, 60))
for position, panel in enumerate(location_panels, start=1):
    column, kind, x_label, tick_size, bins = panel
    ax = fig.add_subplot(4, 2, position)
    # Draw on the explicit axes via ax=; the data is passed by keyword because
    # recent seaborn releases treat the first positional argument as `data`.
    if kind == "count":
        sns.countplot(x=stop_search[column], ax=ax)
        ax.set_ylabel("Count", fontsize=40)
    else:
        sns.distplot(stop_search[column].dropna(), bins=bins, ax=ax)
        ax.set_ylabel("Probability", fontsize=40)
    ax.tick_params(axis="both", labelsize=tick_size)
    ax.set_xlabel(x_label, fontsize=40)
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the inline notebook backend.
plt.show()
(4) Time - LastModifiedDateTime, CreatedDateTime, EventDate
# LastModifiedDateTime: split the timestamp into calendar components.
LastModifiedDateTime = pd.to_datetime(stop_search['LastModifiedDateTime'])
LastModifiedYear = LastModifiedDateTime.dt.year
LastModifiedMonth = LastModifiedDateTime.dt.month
LastModifiedDay = LastModifiedDateTime.dt.day
LastModifiedDayofYear = LastModifiedDateTime.dt.dayofyear
LastModifiedDayofWeek = LastModifiedDateTime.dt.dayofweek
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
# documented replacement and yields the same ISO week numbers.
LastModifiedWeekofYear = LastModifiedDateTime.dt.isocalendar().week
LastModifiedQuarter = LastModifiedDateTime.dt.quarter

# One distribution panel per component on a 4x2 grid (seven cells used),
# filled left-to-right, top-to-bottom: (values, x-axis label, tick label size).
last_modified_panels = [
    (LastModifiedYear, "Last Modified Year", 25),
    (LastModifiedMonth, "Last Modified Month", 20),
    (LastModifiedDay, "Last Modified Day", 25),
    (LastModifiedDayofYear, "Last Modified Day of Year", 15),
    (LastModifiedDayofWeek, "Last Modified Day of Week", 25),
    (LastModifiedWeekofYear, "Last Modified Week of Year", 15),
    (LastModifiedQuarter, "Last Modified Quarter", 25),
]

fig = plt.figure(dpi=100, figsize=(30, 60))
for position, (values, x_label, tick_size) in enumerate(last_modified_panels, start=1):
    ax = fig.add_subplot(4, 2, position)
    # Drop missing entries and hand seaborn plain floats; the week component
    # uses a nullable integer dtype that not every seaborn version accepts.
    sns.distplot(values.dropna().astype("float64"), ax=ax)
    ax.tick_params(axis="both", labelsize=tick_size)
    ax.set_xlabel(x_label, fontsize=40)
    ax.set_ylabel("Probability", fontsize=40)
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the inline notebook backend.
plt.show()
# CreatedDateTime: split the timestamp into calendar components.
CreatedDateTime = pd.to_datetime(stop_search['CreatedDateTime'])
CreatedYear = CreatedDateTime.dt.year
CreatedMonth = CreatedDateTime.dt.month
CreatedDay = CreatedDateTime.dt.day
CreatedDayofYear = CreatedDateTime.dt.dayofyear
CreatedDayofWeek = CreatedDateTime.dt.dayofweek
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
# documented replacement and yields the same ISO week numbers.
CreatedWeekofYear = CreatedDateTime.dt.isocalendar().week
CreatedQuarter = CreatedDateTime.dt.quarter

# One distribution panel per component on a 4x2 grid (seven cells used),
# filled left-to-right, top-to-bottom: (values, x-axis label, tick label size).
created_panels = [
    (CreatedYear, "Created Year", 25),
    (CreatedMonth, "Created Month", 20),
    (CreatedDay, "Created Day", 25),
    (CreatedDayofYear, "Created Day of Year", 15),
    (CreatedDayofWeek, "Created Day of Week", 25),
    (CreatedWeekofYear, "Created Week of Year", 15),
    (CreatedQuarter, "Created Quarter", 25),
]

fig = plt.figure(dpi=100, figsize=(30, 60))
for position, (values, x_label, tick_size) in enumerate(created_panels, start=1):
    ax = fig.add_subplot(4, 2, position)
    # Drop missing entries and hand seaborn plain floats; the week component
    # uses a nullable integer dtype that not every seaborn version accepts.
    sns.distplot(values.dropna().astype("float64"), ax=ax)
    ax.tick_params(axis="both", labelsize=tick_size)
    ax.set_xlabel(x_label, fontsize=40)
    ax.set_ylabel("Probability", fontsize=40)
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the inline notebook backend.
plt.show()
# EventDate: split the timestamp into calendar and clock components.
EventDate = pd.to_datetime(stop_search['EventDate'])
EventYear = EventDate.dt.year
EventMonth = EventDate.dt.month
EventDay = EventDate.dt.day
EventDayofYear = EventDate.dt.dayofyear
EventDayofWeek = EventDate.dt.dayofweek
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
# documented replacement and yields the same ISO week numbers.
EventWeekofYear = EventDate.dt.isocalendar().week
EventQuarter = EventDate.dt.quarter
EventHour = EventDate.dt.hour
EventMinute = EventDate.dt.minute

# Number of events per week of the year. This has to come after
# EventWeekofYear is computed, otherwise a top-to-bottom run raises NameError.
EventWeekofYear.value_counts()

# One distribution panel per component on a 3x3 grid, filled left-to-right,
# top-to-bottom: (values, x-axis label, tick label size, x-limits).
event_panels = [
    (EventYear, "Event Year", 25, (2010, 2016)),
    (EventMonth, "Event Month", 20, None),
    (EventDay, "Event Day", 25, None),
    (EventDayofYear, "Event Day of Year", 15, None),
    (EventDayofWeek, "Event Day of Week", 25, None),
    (EventWeekofYear, "Event Week of Year", 15, None),
    (EventQuarter, "Event Quarter", 25, None),
    (EventHour, "Event Hour", 25, None),
    (EventMinute, "Event Minute", 25, None),
]

fig = plt.figure(dpi=100, figsize=(50, 50))
for position, panel in enumerate(event_panels, start=1):
    values, x_label, tick_size, x_limits = panel
    ax = fig.add_subplot(3, 3, position)
    # Drop missing entries and hand seaborn plain floats; the week component
    # uses a nullable integer dtype that not every seaborn version accepts.
    sns.distplot(values.dropna().astype("float64"), ax=ax)
    if x_limits is not None:
        # seaborn no longer re-exports pyplot as sns.plt, so set limits on the axes.
        ax.set_xlim(*x_limits)
    ax.tick_params(axis="both", labelsize=tick_size)
    ax.set_xlabel(x_label, fontsize=40)
    ax.set_ylabel("Probability", fontsize=40)
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the inline notebook backend.
plt.show()
(5) Stop type - StopDescription
# Frequency table, then bar chart, of the recorded stop descriptions.
print(stop_search["StopDescription"].value_counts())
# The data is passed by keyword because recent seaborn releases treat the
# first positional argument as `data`.
plot = sns.countplot(x=stop_search["StopDescription"])
# Rotate the category labels by 60 degrees.
for label in plot.get_xticklabels():
    label.set_rotation(60)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel("Stop Type", fontsize=30)
plt.ylabel("Count", fontsize=30)
(6) Stop result - ActionsTaken
# Frequency of each distinct value recorded in ActionsTaken.
action_counts = stop_search["ActionsTaken"].value_counts()
print(action_counts)
(7) Time series analysis - EventDate
from pandas import Series

# Per-record calendar/clock components of EventDate. 'Number' is a constant 1
# so that records can be counted by summing it within groups.
event_time = pd.DataFrame({'Year': EventYear,
                           'Quarter': EventQuarter,
                           'Month': EventMonth,
                           'WeekofYear': EventWeekofYear,
                           'Day': EventDay,
                           'DayofYear': EventDayofYear,
                           'DayofWeek': EventDayofWeek,
                           'Hour': EventHour,
                           'Minute': EventMinute,
                           'Number': 1,
                           })
event_time.head()

# Aggregate by calendar day (year, month, day). Truncating EventDate to
# midnight gives the same date as rebuilding it from Year/Month/Day, without
# the integer arithmetic and string parsing, and keeps missing dates as NaT.
ymd = EventDate.dt.normalize()
ymd_data = pd.DataFrame({'Date': ymd, 'Number': 1})

# Number of records per day; groupby leaves the result indexed by Date, so no
# further set_index call is needed.
agg_ymd = ymd_data.groupby(['Date'])['Number'].sum().to_frame(name='Count')
index = agg_ymd.index

print(ymd_data.shape)
print(ymd_data.head())
print(agg_ymd.shape)
print(agg_ymd.head())

# Daily event counts as a date-indexed series.
timeData = Series(agg_ymd['Count'])

# Whole period, the year 2016, and December 2016. Each range gets its own
# figure so that successive plots are not drawn on top of one another.
for start, end in (('2010', '2016'),
                   ('2016-01-01', '2016-12-31'),
                   ('2016-12-01', '2016-12-31')):
    plt.figure()
    timeData[start:end].plot()

# Days with more than 400 recorded events.
timeData[timeData > 400]